library(ggplot2)
library(tidyverse)

7.3 Variation

diamonds %>% count(cut) # count 
## # A tibble: 5 × 2
##         cut     n
##       <ord> <int>
## 1      Fair  1610
## 2      Good  4906
## 3 Very Good 12082
## 4   Premium 13791
## 5     Ideal 21551
ggplot(diamonds) + geom_bar(aes(x = cut))

diamonds %>%
    count(cut_width(carat, .5))
## # A tibble: 11 × 2
##    `cut_width(carat, 0.5)`     n
##                     <fctr> <int>
## 1             [-0.25,0.25]   785
## 2              (0.25,0.75] 29498
## 3              (0.75,1.25] 15977
## 4              (1.25,1.75]  5313
## 5              (1.75,2.25]  2002
## 6              (2.25,2.75]   322
## 7              (2.75,3.25]    32
## 8              (3.25,3.75]     5
## 9              (3.75,4.25]     4
## 10             (4.25,4.75]     1
## 11             (4.75,5.25]     1
ggplot(diamonds) + geom_histogram(aes(x = carat), binwidth = .5)

7.3.2 Typical values

diamonds %>% 
    filter(carat < 3) %>% 
    ggplot() + 
        geom_histogram(aes(x = carat), binwidth = .01)

*the plot shows that diamonds whose carat is immidiately below 1 or 2 is very small. this implies that integer is important cutpoints for diamond carat

7.3,3 Unusual values

ggplot(diamonds) + 
  geom_histogram(mapping = aes(x = y), binwidth = 0.5) +
  coord_cartesian(ylim = c(0, 50)) # zoom in y 0:50

diamonds %>%
    filter(y < 3 | y > 20) %>%
    dplyr::select(price, x, y, z) %>%
    arrange(y)
## # A tibble: 9 × 4
##   price     x     y     z
##   <int> <dbl> <dbl> <dbl>
## 1  5139  0.00   0.0  0.00
## 2  6381  0.00   0.0  0.00
## 3 12800  0.00   0.0  0.00
## 4 15686  0.00   0.0  0.00
## 5 18034  0.00   0.0  0.00
## 6  2130  0.00   0.0  0.00
## 7  2130  0.00   0.0  0.00
## 8  2075  5.15  31.8  5.12
## 9 12210  8.09  58.9  8.06

7.3.4 Exercises

1:2

ggplot(diamonds, aes(x = x)) +
    geom_histogram(binwidth = .05)

# price around 1500 does not exist

3

diamonds %>%
    filter(carat == .99)
## # A tibble: 23 × 10
##    carat       cut color clarity depth table price     x     y     z
##    <dbl>     <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1   0.99      Fair     I     SI2  68.1    56  2811  6.21  6.06  4.18
## 2   0.99      Fair     J     SI1  55.0    61  2812  6.72  6.67  3.68
## 3   0.99      Fair     J     SI1  58.0    67  2949  6.57  6.50  3.79
## 4   0.99      Fair     I     SI1  60.7    66  3337  6.42  6.34  3.87
## 5   0.99      Fair     H     VS2  71.6    57  3593  5.94  5.80  4.20
## 6   0.99 Very Good     J     SI1  60.3    57  4002  6.44  6.49  3.90
## 7   0.99      Good     F     SI2  63.3    54  4052  6.36  6.43  4.05
## 8   0.99   Premium     F     SI2  60.6    61  4075  6.45  6.38  3.89
## 9   0.99     Ideal     I     SI1  61.8    57  4763  6.40  6.42  3.96
## 10  0.99 Very Good     E     SI2  61.8    59  4780  6.30  6.33  3.90
## # ... with 13 more rows
diamonds %>%
    filter(carat == 1)
## # A tibble: 1,558 × 10
##    carat     cut color clarity depth table price     x     y     z
##    <dbl>   <ord> <ord>   <ord> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
## 1      1 Premium     I     SI2  58.2    60  2795  6.61  6.55  3.83
## 2      1 Premium     J     SI2  62.3    58  2801  6.45  6.34  3.98
## 3      1    Fair     G      I1  66.4    59  2808  6.16  6.09  4.07
## 4      1    Fair     J     VS2  65.7    59  2811  6.14  6.07  4.01
## 5      1 Premium     H      I1  61.3    60  2818  6.43  6.39  3.93
## 6      1    Fair     H     SI2  65.3    62  2818  6.34  6.12  4.08
## 7      1 Premium     F      I1  58.9    60  2841  6.60  6.55  3.87
## 8      1    Fair     G     SI2  67.8    61  2856  5.96  5.90  4.02
## 9      1    Fair     I     SI1  67.9    62  2856  6.19  6.03  4.15
## 10     1    Fair     H     SI2  66.1    56  2856  6.21  5.97  4.04
## # ... with 1,548 more rows
diamonds %>%
    mutate(id = row_number()) %>%
    dplyr::select(x, y, z, id) %>%
    gather(variable, value, -id) # melt with `x,y,z` without `id` while remaining it as `id`
## # A tibble: 161,820 × 3
##       id variable value
##    <int>    <chr> <dbl>
## 1      1        x  3.95
## 2      2        x  3.89
## 3      3        x  4.05
## 4      4        x  4.20
## 5      5        x  4.34
## 6      6        x  3.94
## 7      7        x  3.95
## 8      8        x  4.07
## 9      9        x  3.87
## 10    10        x  4.00
## # ... with 161,810 more rows
  # the first argument(`variable`) take the colnames and the second argument (`value`) takes the values of the argument for each row

diamonds %>%
    mutate(id = row_number()) %>%
    dplyr::select(x, y, z, id) %>%
    gather(variable, value, -id) %>%
    ggplot(aes(x = value)) + geom_density() + facet_grid(variable ~ .) + geom_rug()

  • geom_rug() gives you the each case, marginal distibutions
  • facet_grid display for each variable, you can use facet_grid(~variable) instead too

4

ggplot(diamonds) + 
    geom_histogram(mapping = aes(x = price)) +
    coord_cartesian(xlim = c(100, 5000), ylim = c(0, 3000))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(diamonds) + 
    geom_histogram(mapping = aes(x = price)) +
    xlim(100, 5000) + ylim(0, 3000)
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 14714 rows containing non-finite values (stat_bin).
## Warning: Removed 5 rows containing missing values (geom_bar).

coord_cartesian simply zoom while xlim and ylim drop the values exceed the given limits and draw the graph

7.4 Missing Value

# return `NA` if the value matches the logical vector in the first argument
diamonds2 <- diamonds %>%
  mutate(y = ifelse(y < 3 | y > 20, NA, y))

ggplot(data = diamonds2, mapping = aes(x = x, y = y)) +
  geom_point() # or alternatively geom_point(na.rm = T)
## Warning: Removed 9 rows containing missing values (geom_point).

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time),
    sched_hour = sched_dep_time %/% 100,
    sched_min = sched_dep_time %/% 100,
    sched_dep_time = sched_hour + sched_min / 60
  ) %>%
  ggplot(mapping = aes(sched_dep_time)) +
    geom_freqpoly(mapping = aes(color = cancelled), binwidth = 1/4)

7.4.1 Exercises

  • In the bar chart (using geom_bar), NA is regarded as a new category.

7.5 Covariation

7.5.1 Acategorical and continuous variable

# price毎にdensity plotを作成する
ggplot(data = diamonds, mapping = aes(x = price, y = ..density..)) +
  geom_freqpoly(mapping = aes(color = cut), binwidth = 500)

# cut毎に、priceの位置をboxplotした
ggplot(data = diamonds, mapping = aes(x = cut, y = price)) +
  geom_boxplot()

7.5.1.1 Exercises

1

nycflights13::flights %>%
  mutate(
    cancelled = is.na(dep_time)
  ) %>%
  ggplot(mapping = aes(x = cancelled, y = sched_dep_time)) + geom_boxplot()

3

ggplot(data = mpg) +
  geom_boxplot(mapping = aes(x = reorder(class, hwy, FUN = median), y = hwy)) +
  coord_flip()

7.5.2 Two categorical variables

ggplot(data = diamonds) +
  geom_count(mapping = aes(x = cut, y = color))

diamonds %>%
  count(color, cut) %>%
  ggplot(mapping = aes(x = color, y = cut)) +
    geom_tile(mapping = aes(fill = n))

7.5.2.1 Excersises

1

# the two codes below provide same results
diamonds %>%
  count(color, cut)
## Source: local data frame [35 x 3]
## Groups: color [?]
## 
##    color       cut     n
##    <ord>     <ord> <int>
## 1      D      Fair   163
## 2      D      Good   662
## 3      D Very Good  1513
## 4      D   Premium  1603
## 5      D     Ideal  2834
## 6      E      Fair   224
## 7      E      Good   933
## 8      E Very Good  2400
## 9      E   Premium  2337
## 10     E     Ideal  3903
## # ... with 25 more rows
diamonds %>%
  group_by(color, cut) %>%
  dplyr::summarise(n = n())
## Source: local data frame [35 x 3]
## Groups: color [?]
## 
##    color       cut     n
##    <ord>     <ord> <int>
## 1      D      Fair   163
## 2      D      Good   662
## 3      D Very Good  1513
## 4      D   Premium  1603
## 5      D     Ideal  2834
## 6      E      Fair   224
## 7      E      Good   933
## 8      E Very Good  2400
## 9      E   Premium  2337
## 10     E     Ideal  3903
## # ... with 25 more rows

2

nycflights13::flights %>%
  group_by(month, dest) %>%
  dplyr::summarise(dep_delay = mean(dep_delay, na.rm = T)) %>%
  group_by(dest) %>% # この前でungroup()してもいい
  filter(n() == 12) # 12のフライトがあるdestのみを抽出 = 毎月フライトがあるdestを抽出。上記でgroup_by
## Source: local data frame [1,008 x 3]
## Groups: dest [84]
## 
##    month  dest dep_delay
##    <int> <chr>     <dbl>
## 1      1   ALB 41.396825
## 2      1   ATL  4.471918
## 3      1   AUS 10.301775
## 4      1   BDL 21.108108
## 5      1   BHM 30.347826
## 6      1   BNA 13.227390
## 7      1   BOS  5.235004
## 8      1   BQN  5.870968
## 9      1   BTV 11.702703
## 10     1   BUF 13.796163
## # ... with 998 more rows

7.5.3 Two continuous variables

ggplot(data = diamonds) + 
  geom_point(mapping = aes(x = carat, y = price), alpha = .01)

7.5.3.1

1

# one option
# cut_width determine the cut points for changing color 
# in this example by 0.3 each, [0.45, 0.75], [0.75, 1.05]
ggplot(data = diamonds) +
  geom_freqpoly(aes(x = price, color = cut_width(carat, .3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_freqpoly(aes(x = price, y = ..density.., color = cut_width(carat, .3)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# another option
# cut_number determine the cut points based on the number of observation in each cluster
# in this case, each cluster of the size (carat) has 10 observations
ggplot(data = diamonds) +
  geom_freqpoly(aes(x = price, color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ggplot(data = diamonds) +
  geom_freqpoly(aes(x = price, y = ..density..,  color = cut_number(carat, 10)))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

2

# `x` of boxplot control the number to be plot
# `y` of boxplot control the partion
# weird
ggplot(data = diamonds) +
  geom_boxplot(aes(x = carat, y = cut_number(price, 10)))
## Warning: position_dodge requires non-overlapping x intervals

# good
ggplot(data = diamonds) +
  geom_boxplot(aes(y = carat, x = cut_number(price, 10))) +
  coord_flip()

4

# hexagonal heatmap 
ggplot(diamonds, aes(x = carat, y = price)) +
  geom_hex() +
  facet_wrap(~ cut, ncol = 1) 

7.6

library(modelr)

mod <- lm(log(price) ~ log(carat), diamonds)

diamonds3 <- diamonds %>%
  add_residuals(mod) %>%
  mutate(resid = exp(resid))

ggplot(diamonds3) +
  geom_point(mapping = aes(x = carat, y = resid))

ggplot(diamonds3) +
  geom_boxplot(aes(x = cut, y = resid))

ggplot(diamonds3) +
  geom_boxplot(aes(x = cut, y = carat))

ggplot(diamonds3) +
  geom_boxplot(aes(x = cut, y = price))